rm(list=ls(all=TRUE))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)

Metabolome dataset

Read combined metabolomics dataset:

data_wide <- read_csv("Data/data_final.csv")
## Rows: 44570 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Feature_ID, Dataset
## dbl (78): Row_ID, Row_MZ, Row_RT, Correlation_Group_ID, EMR_04_10_MD, EMR_04...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Make separate table for feature metadata:

feature_info <- data_wide %>% 
  select("Feature_ID", "Dataset", "Row_ID", "Row_MZ", "Row_RT", "Correlation_Group_ID")

feature_info

Transform into long format:

data_long <- data_wide %>% 
  select(Feature_ID, starts_with("EMR_04")) %>% 
  pivot_longer(cols = c(starts_with("EMR_04")), names_to = "Sample_ID", values_to = "Log10_Ratio")

rm(data_wide)

data_long

Sample metadata

Read sample metadata:

sample_info <- read_tsv("Data/sample_metadata.tsv")
## Rows: 74 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): Sample_Identifier, SampleCollectionDateandTime, Metabolomics_Data,...
## dbl  (3): AgeInYears, Sebumeter_Score, Skicon_Score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Select and rename colums:

sample_info <- sample_info %>% 
  select(Sample_ID = Sample_Identifier, Study_Group = ATTRIBUTE_Study_Group, 
    Age = AgeInYears, Sex = ATTRIBUTE_BiologicalSex, Ethnic_Group = ATTRIBUTE_Ethnic_Group,
    Sebumeter_Score, Skicon_Score)

sample_info

Sample numbers:

sample_info %>% 
  count(Study_Group)

Check distribution of numerical parameters:

sample_info <- sample_info %>% 
  mutate(
    Log10_Sebumeter_Score = log10(Sebumeter_Score),
    Log10_Skicon_Score    = log10(Skicon_Score)
    )

sample_info %>% 
  pivot_longer(
    cols = c(
      Sebumeter_Score, Log10_Sebumeter_Score, 
      Skicon_Score, Log10_Skicon_Score, 
      Age
     ),
    names_to = "Parameter", values_to = "Value"
  ) %>% 
  filter(!is.na(Value)) %>% 
  mutate(
    Parameter = factor(
      Parameter, 
      levels = c(
        "Sebumeter_Score", "Log10_Sebumeter_Score", 
        "Skicon_Score", "Log10_Skicon_Score", 
        "Age"
        )
      )
    ) %>% 
  ggplot() +
  geom_histogram(aes(Value), bins = 10) +
  facet_wrap("Parameter", ncol = 2, scales = "free")

Statistical analysis

Nested tibble:

data_nested <- data_long %>% 
  inner_join(
    sample_info,
    by = "Sample_ID"
  ) %>% 
  group_by(Feature_ID) %>% 
  nest() %>% 
  ungroup()

data_nested$data[[1]]

Contrast between oily and normal scalp

Study_Group is the variable of interest, Age, Sex, Ethnic_Group, and Log10_Skicon_Score are confounding variables.

Fit linear models:

data_nested <- data_nested %>% 
  mutate(
    Models = data %>% map(
      ~ lm(Log10_Ratio ~ Study_Group + Age + Sex + Ethnic_Group + Log10_Skicon_Score,
           data = ., na.action = na.omit) %>% try()
    ),
    Classes = Models %>% map_chr(~ paste(class(.), collapse = " "))
  )
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
##   contrasts can be applied only to factors with 2 or more levels
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
##   contrasts can be applied only to factors with 2 or more levels
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
##   contrasts can be applied only to factors with 2 or more levels
data_nested %>% count(Classes)

Extract model coefficients:

stats_study_group <- data_nested %>% 
  filter(Classes == "lm") %>% 
  mutate(
    Coefficients = Models %>% map(tidy)
  ) %>% 
  select(Feature_ID, Coefficients) %>% 
  unnest(Coefficients) %>% 
  filter(term == "Study_GroupOily Scalp")

stats_study_group

Correlation with Sebumeter score:

Sebumeter_Score + Log10_Skicon_Score are the variables of interest, Age, Sex, and Ethnic_Group are confounding variables.

Fit linear models:

data_nested <- data_nested %>% 
  mutate(
    Models = data %>% map(
      ~ lm(Log10_Ratio ~ Age + Sex + Ethnic_Group + Sebumeter_Score + Log10_Skicon_Score,
           data = ., na.action = na.omit) %>% try()
    ),
    Classes = Models %>% map_chr(~ paste(class(.), collapse = " "))
  )
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
##   contrasts can be applied only to factors with 2 or more levels
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
##   contrasts can be applied only to factors with 2 or more levels
data_nested %>% count(Classes)

Extract model coefficients:

stats_physical_skin_parameters <- data_nested %>% 
  filter(Classes == "lm") %>% 
  mutate(
    Coefficients = Models %>% map(tidy)
  ) %>% 
  select(Feature_ID, Coefficients) %>% 
  unnest(Coefficients) %>% 
  filter(term == "Sebumeter_Score" | term == "Log10_Skicon_Score")

stats_physical_skin_parameters

Formatted stats results

Combine stats results:

feature_stats <- bind_rows(stats_study_group, stats_physical_skin_parameters)

rm(data_nested, stats_study_group, stats_physical_skin_parameters)

feature_stats

Add contrast labels and format stats results for Cytoscape:

feature_stats <- feature_stats %>% 
  mutate(
    p.volcano  = -log10(p.value),
    p.volc.dir = p.volcano * sign(statistic),
    Contrast_Label = term %>% case_match(
      "Study_GroupOily Scalp" ~ "groups|oily-norm",
      "Sebumeter_Score" ~ "base|sebum",
      "Log10_Skicon_Score" ~ "base|skicon"
      ),
    p_value    = p.value    %>% round(digits = 3) %>% as.character(),
    p_volcano  = p.volcano  %>% round(digits = 3) %>% as.character(),
    p_volc_dir = p.volc.dir %>% round(digits = 3) %>% as.character(),
    p_cat      = case_when(
      p.value < 0.001 ~ "< 0.001",
      p.value <  0.01 ~ "< 0.01",
      p.value <   0.1 ~ "< 0.1"
    ),
    p_cat_dir  = case_when(
      is.na(p_cat)  ~ NA_character_,
      statistic > 0 ~ paste0("Up, ", p_cat),
      statistic < 0 ~ paste0("Dn, ", p_cat)
    )
  )

feature_stats

Output for Cytoscape

Combine stats results with feature info and write to file:

feature_stats_info <- feature_info %>% 
  select(Feature_ID, Dataset, Row_ID) %>% 
  inner_join(
    feature_stats %>% select(Feature_ID, Contrast_Label, starts_with("p_")),
    by = "Feature_ID"
    )

feature_stats_info %>% write_csv("Analysis/feature_stats.csv")

feature_stats_info

Write results for to file for loading into Cytoscape (individual formats, individual datasets):

feature_stats_info %>% 
  pivot_longer(cols = starts_with("p_"), names_to = "Format", values_to = "Value", values_drop_na = T) %>% 
  pivot_wider(names_from = Contrast_Label, values_from = Value) %>% 
  select(Dataset, Format, Row_ID, contains("|")) %>% 
  group_by(Dataset, Format) %>% 
  nest() %>% 
  mutate(
    Filename = str_c(Dataset, Format, "tsv", sep = "."),
    data     = map2(data, Format, \(data, Format) data %>% rename_with(\(x) str_c(Format, x, sep = "|"), contains("|"))),
    data     = map2(data, Filename, \(data, Filename) data %>% write_tsv(str_c("Analysis/", Filename), na = ""))
  )
dir("Analysis")
##  [1] "C18_neg.p_cat_dir.tsv"    "C18_neg.p_cat.tsv"       
##  [3] "C18_neg.p_value.tsv"      "C18_neg.p_volc_dir.tsv"  
##  [5] "C18_neg.p_volcano.tsv"    "C18_pos.p_cat_dir.tsv"   
##  [7] "C18_pos.p_cat.tsv"        "C18_pos.p_value.tsv"     
##  [9] "C18_pos.p_volc_dir.tsv"   "C18_pos.p_volcano.tsv"   
## [11] "feature_stats.csv"        "HILIC_neg.p_cat_dir.tsv" 
## [13] "HILIC_neg.p_cat.tsv"      "HILIC_neg.p_value.tsv"   
## [15] "HILIC_neg.p_volc_dir.tsv" "HILIC_neg.p_volcano.tsv" 
## [17] "HILIC_pos.p_cat_dir.tsv"  "HILIC_pos.p_cat.tsv"     
## [19] "HILIC_pos.p_value.tsv"    "HILIC_pos.p_volc_dir.tsv"
## [21] "HILIC_pos.p_volcano.tsv"